# -*- coding: utf-8 -*-
"""Untitled237.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1BKZvpwUJJzlPCBC3hLMVxPNhyNazHTad
"""

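# Import the core packages used throughout the script. The overall workflow
# needs packages to view the variable importance, address class imbalance,
# split and train on the dataset, and report random forest performance
# metrics; the scikit-learn pieces are imported further down, where they are
# first used.
import pandas as pd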
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training dataset from the Colab filesystem.
df = pd.read_csv("/content/Penzance Machone Training Variables.csv")

# Basic inspection of the dataset. Per the original author's note, outliers,
# multicollinearity and data normality were investigated before the dataset
# was imported, so only a quick sanity check is done here.
# A notebook echoes the last bare expression of a cell, but a plain script
# shows nothing, so every summary is printed explicitly.
print(df.head())
df.info()  # prints its own report and returns None, so it is not wrapped
print(df.describe())
print(df.isnull().sum())  # number of missing values per column


# Separate the feature variables from the target variable. The target is
# taken to be the LAST column of the file (the overtopping label, per the
# original comment); every other column is used as a feature.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]
print("Feature matrix shape:", X.shape)
print("Target vector shape:", Y.shape)

# Baseline model: hold-out split, a fixed-parameter random forest, and two
# views of its performance (10-fold cross-validation and a test-set report).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Rows are assigned to the training and test sets at random (20% held out for
# testing) to avoid selection bias; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=99,
)

# Random forest for the binary classification problem. Splits are chosen with
# the Gini impurity criterion (an impurity of 0 means a node holds a single
# class). The classifier is fitted on the training rows only.
clf = RandomForestClassifier(
    criterion="gini",
    max_depth=8,
    min_samples_split=10,
    random_state=5,
).fit(X_train, y_train)

# Predictions of the baseline model on the held-out test rows
y_pred = clf.predict(X_test)

# 10-fold cross-validation on the training rows, to check that the hold-out
# result is not an artefact of one particular split
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))


# Candidate values for the number of trees in the random forest
n_estimators = [50, 100, 150]
# Candidate rules for how many features are considered at each split
max_features = ['log2', 'sqrt']
# Candidate maximum depths of each decision tree (None means no depth limit)
max_depth = [None, 10, 20, 30]
# Candidate minimum numbers of samples required to split an internal node
min_samples_split = [10, 20, 40]
# Candidate minimum numbers of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]
# Whether each tree is built on a bootstrap sample of the training rows (True)
# or on the whole training set (False)
bootstrap = [True, False]


# Collect the candidate values into one grid, keyed by the corresponding
# RandomForestClassifier parameter name.
# NOTE(review): in the visible code this grid is only printed; the randomized
# search further down defines and uses its own param_distributions.
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

print(param_grid)

# Hyperparameter tuning with a randomized search

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search. This is a separate,
# smaller search space: every list here is a subset of the matching list in
# param_grid, which itself is not passed to the search.
param_distributions = {
    'n_estimators': [50, 100, 150],
    'max_features': ['log2', 'sqrt'],
    'max_depth': [None, 10, 30],
    'min_samples_split': [10, 40],
    'min_samples_leaf': [1, 8],
    'bootstrap': [True, False]
}

# Instantiate the model. The estimator is seeded as well as the search:
# the search seed only fixes WHICH parameter combinations are drawn, while an
# unseeded forest would still grow different trees on every run, making the
# selected parameters and all reported scores non-reproducible.
rf_Model = RandomForestClassifier(random_state=42)

# Randomized search: 10 sampled parameter combinations, each scored with
# 3-fold cross-validation on the training rows, using all available cores.
rf_Grid = RandomizedSearchCV(
    estimator=rf_Model,
    param_distributions=param_distributions,
    cv=3,
    verbose=2,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
rf_Grid.fit(X_train, y_train)

# Print the best parameters found
print(rf_Grid.best_params_)

# Accuracy of the refitted best estimator on the training and test rows
print(f'Train Accuracy - : {rf_Grid.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {rf_Grid.score(X_test, y_test):.3f}')

# Test-set predictions of the best tuned model at the default decision rule.
# NOTE: despite its name, this variable holds the predicted labels, not a model.
Random_Forest_Best_Tuned_Model = rf_Grid.best_estimator_.predict(X_test)

# Classification report of the tuned model; the next step tries to raise the
# F1 score by adjusting the decision threshold.
report = classification_report(y_test, Random_Forest_Best_Tuned_Model)
print(report)

# Imported once, up front, instead of on every iteration of the sweep loop
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Predicted probability of the positive class for every test row. Column
# index 1 of predict_proba is the second entry of best_estimator_.classes_,
# which is presumably the overtopping class - confirm against the label coding.
y_different_threshold_values = rf_Grid.best_estimator_.predict_proba(X_test)[:, 1]

# Candidate decision thresholds: 0.01, 0.02, ..., 0.99
thresholds = np.arange(0.01, 1.0, 0.01)
precision_scores = []
recall_scores = []
f1_scores = []

# Evaluate Precision, Recall and F1-Score at every threshold (with particular
# attention to the F1 value).
for threshold in thresholds:
    # A strict ">" is used so the table describes exactly the decision rule
    # that the rest of the script applies when a threshold is chosen
    # (probability > threshold); ">=" would differ for tied probabilities.
    y_pred_threshold = (y_different_threshold_values > threshold).astype(int)

    # zero_division=0 reports an undefined score as 0 (the same value as the
    # default behaviour) without emitting a warning at the extreme thresholds
    # where no row is predicted positive.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred_threshold, average='binary', zero_division=0
    )
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)


# Print one table row per threshold with the corresponding model performance
for threshold, precision, recall, f1 in zip(thresholds, precision_scores, recall_scores, f1_scores):
    print(f"Threshold: {threshold:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


from sklearn.metrics import confusion_matrix


# DIAL 1 - ADJUST THE HARMONIC F1
# Decision threshold applied to the tuned random forest in this section
threshold = 0.7  # FOR THE TUNED MODEL


# Positive-class probabilities of the tuned model on the test rows, turned
# into labels: a row is labelled 1.0 when its probability exceeds the
# threshold, otherwise 0.0.
tuned_positive_proba = rf_Grid.best_estimator_.predict_proba(X_test)[:, 1]
y_pred = (tuned_positive_proba > threshold).astype('float')

# Model performance at this threshold, shown as a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")


# joblib is needed to save the finalised model. The notebook ran the IPython
# shell command "pip install joblib" here; that "!"-prefixed syntax is not
# valid Python and stops a plain script with a SyntaxError, so it is removed.
# joblib is installed together with scikit-learn (it is one of its
# dependencies); if it is ever missing, run "pip install joblib" in a shell.
import joblib


# DIAL 2 - ADJUST THE HARMONIC F1

# rf_Grid.best_estimator_ is the best hyperparameter-tuned model. It is saved
# together with the decision threshold to apply to its positive-class
# probabilities.
# NOTE(review): this stored value is set independently of the threshold used
# for the confusion matrix earlier in the script - confirm which is intended.
model_details = {
    'model': rf_Grid.best_estimator_,
    'threshold': 0.5 # our best threshold   # FOR THE FINAL MODEL
}


# Save the model and its threshold in a single file
joblib.dump(model_details, 'Our_random_forest_model')


# Load "Our_random_forest_model" back and apply the stored threshold
loaded_model_details = joblib.load('Our_random_forest_model')
threshold = loaded_model_details['threshold']
# NOTE: despite its name, this is a boolean mask (probability > threshold),
# not the probabilities themselves.
Prediction_Probability = loaded_model_details['model'].predict_proba(X_test)[:, 1] > threshold
y_pred = (Prediction_Probability).astype(int)


import joblib
from sklearn.model_selection import cross_val_score

# Check whether the best hyperparameter-tuned model performs similarly under
# k-fold cross-validation.

# Number of cross-validation folds
cv_folds = 10

# Reload the saved bundle and take the tuned random forest out of it
Best_tuned_model = joblib.load('Our_random_forest_model')
model = Best_tuned_model['model']

# Accuracy of the model on each fold, computed over the full dataset (X, Y)
scores = cross_val_score(
    model,
    X,
    Y,
    cv=cv_folds,
    scoring='accuracy',
)
print("The different cross-validation scores:", scores)
print("Average cross-validation scores:", scores.mean())


# Final summary of the predictions of the best tuned model, using the
# decision threshold stored alongside it in the saved file.
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Reload the saved bundle and pull out the model and its threshold
loaded_model_details = joblib.load('Our_random_forest_model')
model, threshold = loaded_model_details['model'], loaded_model_details['threshold']

# Positive-class probabilities for the test set
y_predictions_for_best_random_forest_model = model.predict_proba(X_test)[:, 1]

# Apply the stored threshold: a row is labelled 1 when its probability
# exceeds the threshold, otherwise 0.
above_threshold = y_predictions_for_best_random_forest_model > threshold
y_pred = above_threshold.astype(int)

# Overall accuracy at this threshold
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 for the best tuned model
report = classification_report(
    y_test,
    y_pred,
    target_names=['Class 0', 'Class 1'],
)
print("Classification Report:\n", report)


from sklearn.metrics import matthews_corrcoef

# Matthews Correlation Coefficient between the true test labels (y_test) and
# the thresholded predictions (y_pred)
mcc = matthews_corrcoef(y_true=y_test, y_pred=y_pred)

print("Matthews Correlation Coefficient:", mcc)



# Probabilities of the positive class (column index 1) predicted by the
# trained model for the test dataset
positive_class_column = 1
y_pred_probs = model.predict_proba(X_test)[:, positive_class_column]